import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from scipy import stats
import folium
from folium.plugins import HeatMap
from IPython.display import display, clear_output
import ipywidgets as widgets
import networkx as nx
from geopy.distance import geodesic
import random
import time
import zipfile
C:\Users\teste\anaconda3\Lib\site-packages\pandas\core\arrays\masked.py:61: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed). from pandas.core import (
# Load the traffic dataset from a local CSV export into the notebook-wide `df`.
# NOTE(review): the absolute Windows path is machine-specific — adjust it
# before running this notebook anywhere else.
df = pd.read_csv(r"C:\Users\teste\Downloads\smart_traffic_management_dataset.csv")
# Print column names, non-null counts and dtypes as a quick schema check.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2000 entries, 0 to 1999 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 timestamp 2000 non-null object 1 location_id 2000 non-null int64 2 traffic_volume 2000 non-null int64 3 avg_vehicle_speed 2000 non-null float64 4 vehicle_count_cars 2000 non-null int64 5 vehicle_count_trucks 2000 non-null int64 6 vehicle_count_bikes 2000 non-null int64 7 weather_condition 2000 non-null object 8 temperature 2000 non-null float64 9 humidity 2000 non-null float64 10 accident_reported 2000 non-null int64 11 signal_status 2000 non-null object dtypes: float64(3), int64(6), object(3) memory usage: 187.6+ KB
# Preview the first rows to confirm the columns were parsed as expected.
df.head()
| timestamp | location_id | traffic_volume | avg_vehicle_speed | vehicle_count_cars | vehicle_count_trucks | vehicle_count_bikes | weather_condition | temperature | humidity | accident_reported | signal_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2024-01-01 00:00:00 | 4 | 504 | 53.124162 | 142 | 24 | 44 | Cloudy | 33.334387 | 36.390698 | 0 | Red |
| 1 | 2024-01-01 00:01:00 | 5 | 209 | 44.947850 | 862 | 50 | 23 | Cloudy | 17.926830 | 37.640927 | 0 | Green |
| 2 | 2024-01-01 00:02:00 | 3 | 572 | 63.179229 | 317 | 12 | 10 | Windy | 33.483375 | 84.262610 | 1 | Red |
| 3 | 2024-01-01 00:03:00 | 5 | 699 | 42.269697 | 709 | 43 | 21 | Sunny | 19.212941 | 61.550978 | 0 | Yellow |
| 4 | 2024-01-01 00:04:00 | 5 | 639 | 72.185791 | 594 | 34 | 14 | Cloudy | 11.349244 | 77.494506 | 0 | Red |
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()
| location_id | traffic_volume | avg_vehicle_speed | vehicle_count_cars | vehicle_count_trucks | vehicle_count_bikes | temperature | humidity | accident_reported | |
|---|---|---|---|---|---|---|---|---|---|
| count | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 |
| mean | 2.991500 | 540.959000 | 50.009035 | 449.872500 | 49.269000 | 24.117000 | 22.447538 | 60.105652 | 0.054000 |
| std | 1.430892 | 271.933985 | 17.267898 | 253.695741 | 28.830196 | 14.297011 | 7.200535 | 17.245294 | 0.226074 |
| min | 1.000000 | 50.000000 | 20.011192 | 20.000000 | 0.000000 | 0.000000 | 10.025557 | 30.007119 | 0.000000 |
| 25% | 2.000000 | 309.000000 | 35.168589 | 227.750000 | 24.000000 | 12.000000 | 16.410169 | 45.179396 | 0.000000 |
| 50% | 3.000000 | 549.000000 | 50.412652 | 452.000000 | 49.000000 | 24.000000 | 22.364844 | 60.450407 | 0.000000 |
| 75% | 4.000000 | 774.000000 | 65.135366 | 660.250000 | 74.000000 | 36.000000 | 28.563564 | 74.805580 | 0.000000 |
| max | 5.000000 | 998.000000 | 79.972635 | 899.000000 | 99.000000 | 49.000000 | 34.990891 | 89.989003 | 1.000000 |
# --- Preprocessing ---------------------------------------------------------
# Parse the timestamp strings into real datetimes.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Encode the categorical weather labels as integer codes.
label_encoder = LabelEncoder()
df['weather_condition'] = label_encoder.fit_transform(df['weather_condition'])

# Mapping {'Yes': 1, 'No': 0} over a column that already holds 0/1 integers
# matches nothing and turns every value into NaN (which the mean-fill below
# cannot repair, because the mean of an all-NaN column is NaN as well).
# Only translate the labels when the column really contains text.
if not pd.api.types.is_numeric_dtype(df['accident_reported']):
    df['accident_reported'] = df['accident_reported'].map({'Yes': 1, 'No': 0})

# Ordinal encoding of the signal colour: Green < Yellow < Red.
df['signal_status'] = df['signal_status'].map({'Green': 0, 'Yellow': 1, 'Red': 2})

# Fill any remaining gaps with the column mean. Restricting the mean to
# numeric columns keeps the datetime column out of the computation.
df.fillna(df.mean(numeric_only=True), inplace=True)

# Columns offered by the interactive plots and used for outlier detection.
numeric_features = [
    'traffic_volume', 'avg_vehicle_speed', 'vehicle_count_cars',
    'vehicle_count_trucks', 'vehicle_count_bikes', 'temperature',
    'humidity', 'accident_reported', 'signal_status',
]
def eda_overview():
    """Print a quick exploratory summary of the notebook-wide ``df``.

    Shows the shape, the schema report, descriptive statistics, the number
    of missing values per column, and a count plot of ``signal_status``.
    Returns nothing; all output is printed or displayed.
    """
    print("Dataset Shape:", df.shape)

    print("\nDataset Info:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() only appends a stray "None" to the output.
    df.info()

    print("\nDataset Description:")
    display(df.describe())

    print("\nMissing Values per Column:")
    print(df.isnull().sum())

    print("\nDistribution of Signal Status:")
    sns.countplot(x='signal_status', data=df)
    plt.show()


eda_overview()
Dataset Shape: (2000, 12) Dataset Info: <class 'pandas.core.frame.DataFrame'> RangeIndex: 2000 entries, 0 to 1999 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 timestamp 2000 non-null datetime64[ns] 1 location_id 2000 non-null int64 2 traffic_volume 2000 non-null int64 3 avg_vehicle_speed 2000 non-null float64 4 vehicle_count_cars 2000 non-null int64 5 vehicle_count_trucks 2000 non-null int64 6 vehicle_count_bikes 2000 non-null int64 7 weather_condition 2000 non-null int32 8 temperature 2000 non-null float64 9 humidity 2000 non-null float64 10 accident_reported 0 non-null float64 11 signal_status 2000 non-null int64 dtypes: datetime64[ns](1), float64(4), int32(1), int64(6) memory usage: 179.8 KB None Dataset Description:
| timestamp | location_id | traffic_volume | avg_vehicle_speed | vehicle_count_cars | vehicle_count_trucks | vehicle_count_bikes | weather_condition | temperature | humidity | accident_reported | signal_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 0.0 | 2000.00000 |
| mean | 2024-01-01 16:39:29.999999744 | 2.991500 | 540.959000 | 50.009035 | 449.872500 | 49.269000 | 24.117000 | 2.016500 | 22.447538 | 60.105652 | NaN | 1.03700 |
| min | 2024-01-01 00:00:00 | 1.000000 | 50.000000 | 20.011192 | 20.000000 | 0.000000 | 0.000000 | 0.000000 | 10.025557 | 30.007119 | NaN | 0.00000 |
| 25% | 2024-01-01 08:19:45 | 2.000000 | 309.000000 | 35.168589 | 227.750000 | 24.000000 | 12.000000 | 1.000000 | 16.410169 | 45.179396 | NaN | 0.00000 |
| 50% | 2024-01-01 16:39:30 | 3.000000 | 549.000000 | 50.412652 | 452.000000 | 49.000000 | 24.000000 | 2.000000 | 22.364844 | 60.450407 | NaN | 1.00000 |
| 75% | 2024-01-02 00:59:15 | 4.000000 | 774.000000 | 65.135366 | 660.250000 | 74.000000 | 36.000000 | 3.000000 | 28.563564 | 74.805580 | NaN | 2.00000 |
| max | 2024-01-02 09:19:00 | 5.000000 | 998.000000 | 79.972635 | 899.000000 | 99.000000 | 49.000000 | 4.000000 | 34.990891 | 89.989003 | NaN | 2.00000 |
| std | NaN | 1.430892 | 271.933985 | 17.267898 | 253.695741 | 28.830196 | 14.297011 | 1.449234 | 7.200535 | 17.245294 | NaN | 0.80309 |
Missing Values per Column: timestamp 0 location_id 0 traffic_volume 0 avg_vehicle_speed 0 vehicle_count_cars 0 vehicle_count_trucks 0 vehicle_count_bikes 0 weather_condition 0 temperature 0 humidity 0 accident_reported 2000 signal_status 0 dtype: int64 Distribution of Signal Status:
# Selector for the column shown by the univariate plots.
univariate_dropdown = widgets.Dropdown(
    description='Feature:',
    options=numeric_features,
    value='traffic_volume',
)


def plot_univariate(feature):
    """Show a histogram (with KDE) followed by a boxplot for ``df[feature]``."""
    column = df[feature]

    # Distribution of the selected column.
    plt.figure(figsize=(6, 4))
    sns.histplot(column, kde=True, bins=20)
    plt.title(f'Univariate Analysis: {feature}')
    plt.show()

    # Spread and extreme values of the same column.
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=column)
    plt.title(f'Boxplot: {feature}')
    plt.show()


display(widgets.interactive(plot_univariate, feature=univariate_dropdown))
interactive(children=(Dropdown(description='Feature:', options=('traffic_volume', 'avg_vehicle_speed', 'vehicl…
# Axis selectors for the scatter plot.
x_dropdown = widgets.Dropdown(
    description='X-axis:',
    options=numeric_features,
    value='traffic_volume',
)
y_dropdown = widgets.Dropdown(
    description='Y-axis:',
    options=numeric_features,
    value='avg_vehicle_speed',
)


def plot_bivariate(x_feature, y_feature):
    """Scatter ``df[x_feature]`` against ``df[y_feature]``, coloured by signal status."""
    x_values = df[x_feature]
    y_values = df[y_feature]
    signal_codes = df['signal_status']

    plt.figure(figsize=(6, 4))
    sns.scatterplot(x=x_values, y=y_values, hue=signal_codes, palette='coolwarm')
    plt.title(f'Bivariate Analysis: {x_feature} vs {y_feature}')
    plt.show()


display(widgets.interactive(plot_bivariate, x_feature=x_dropdown, y_feature=y_dropdown))
interactive(children=(Dropdown(description='X-axis:', options=('traffic_volume', 'avg_vehicle_speed', 'vehicle…
def plot_correlation():
    """Draw an annotated heatmap of the pairwise correlations of ``numeric_features``."""
    correlation_matrix = df[numeric_features].corr()

    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
    plt.title("Multivariate Correlation Heatmap")
    plt.show()


plot_correlation()
# Absolute z-score of every value, computed column by column. The result is
# wrapped in a labelled DataFrame so that a single feature's scores can be
# looked up by column name regardless of what type zscore() hands back.
z_scores = pd.DataFrame(
    np.abs(stats.zscore(df[numeric_features])),
    index=df.index,
    columns=numeric_features,
)

# A value counts as an outlier when it lies more than 3 standard deviations
# from its column mean.
outliers = z_scores > 3

# Rows that contain an outlier in at least one feature.
df_outliers = df[outliers.any(axis=1)]

outlier_dropdown = widgets.Dropdown(options=numeric_features, description='Feature:')


def show_outliers(feature):
    """Print and display the values of ``feature`` that are outliers.

    Only rows whose z-score for *this* feature exceeds the threshold are
    reported. Selecting from ``df_outliers`` instead would list every row
    that is an outlier in any feature, giving the same count for every
    dropdown choice.
    """
    outlier_values = df.loc[outliers[feature], feature]
    print(f"Number of outliers in {feature}: {len(outlier_values)}")
    display(outlier_values)


display(widgets.interactive(show_outliers, feature=outlier_dropdown))
interactive(children=(Dropdown(description='Feature:', options=('traffic_volume', 'avg_vehicle_speed', 'vehicl…
# Measured quantities only (the encoded accident/signal columns are left out).
numeric_cols = [
    'traffic_volume',
    'avg_vehicle_speed',
    'vehicle_count_cars',
    'vehicle_count_trucks',
    'vehicle_count_bikes',
    'temperature',
    'humidity',
]

# One box per column, all drawn on a shared axis.
boxplot_frame = df[numeric_cols]
plt.figure(figsize=(12, 6))
sns.boxplot(data=boxplot_frame)
plt.title("Outlier Detection in Traffic Dataset")
plt.xticks(rotation=45)
plt.show()
def plot_hotspot(min_signal_status):
    """Display a folium heat map of rows whose signal status is at least the threshold.

    Parameters
    ----------
    min_signal_status : int
        Rows with ``signal_status >= min_signal_status`` contribute a point.
    """
    # NOTE(review): location_id is used as both latitude and longitude here,
    # for the map centre as well as the heat points — confirm this placeholder
    # is intended.
    centre = df['location_id'].mean()
    m = folium.Map(location=[centre, centre], zoom_start=12)

    selected_ids = df.loc[df['signal_status'] >= min_signal_status, 'location_id'].tolist()
    heat_data = [[loc_id, loc_id] for loc_id in selected_ids]

    HeatMap(heat_data).add_to(m)
    display(m)


hotspot_slider = widgets.IntSlider(
    description='Min Signal Status:',
    min=0,
    max=2,
    step=1,
    value=2,
)
display(widgets.interactive(plot_hotspot, min_signal_status=hotspot_slider))
interactive(children=(IntSlider(value=2, description='Min Signal Status:', max=2), Output()), _dom_classes=('w…
# Predictors and target for the signal-status classifier.
target = 'signal_status'
features = [
    'traffic_volume',
    'avg_vehicle_speed',
    'vehicle_count_cars',
    'vehicle_count_trucks',
    'vehicle_count_bikes',
    'temperature',
    'humidity',
    'accident_reported',
]

X, y = df[features], df[target]

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random forest with 100 trees, seeded for repeatable fits.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
def predict_signal_status(traffic_volume, avg_vehicle_speed, vehicle_count_cars, vehicle_count_trucks,
                          vehicle_count_bikes, temperature, humidity, accident_reported):
    """Predict the signal colour for one observation and print it.

    The arguments are assembled into a single-row frame, in the same column
    order the model was trained on, and passed to the notebook-wide
    ``model``. ``accident_reported`` is the text ``'Yes'`` or ``'No'`` and is
    converted to 1 or 0 before prediction.
    """
    status_names = {0: 'Green', 1: 'Yellow', 2: 'Red'}

    observation = {
        'traffic_volume': traffic_volume,
        'avg_vehicle_speed': avg_vehicle_speed,
        'vehicle_count_cars': vehicle_count_cars,
        'vehicle_count_trucks': vehicle_count_trucks,
        'vehicle_count_bikes': vehicle_count_bikes,
        'temperature': temperature,
        'humidity': humidity,
        'accident_reported': int(accident_reported == 'Yes'),
    }
    new_data = pd.DataFrame([observation])

    predicted_code = model.predict(new_data)[0]
    print(f"Predicted Signal Status: {status_names[predicted_code]}")


display(
    widgets.interactive(
        predict_signal_status,
        traffic_volume=widgets.IntSlider(value=100, min=0, max=1000, step=1),
        avg_vehicle_speed=widgets.IntSlider(value=30, min=0, max=120, step=1),
        vehicle_count_cars=widgets.IntSlider(value=50, min=0, max=500, step=1),
        vehicle_count_trucks=widgets.IntSlider(value=10, min=0, max=200, step=1),
        vehicle_count_bikes=widgets.IntSlider(value=20, min=0, max=300, step=1),
        temperature=widgets.IntSlider(value=25, min=-10, max=50, step=1),
        humidity=widgets.IntSlider(value=50, min=0, max=100, step=1),
        accident_reported=widgets.Dropdown(description='Accident', options=['Yes', 'No']),
    )
)
interactive(children=(IntSlider(value=100, description='traffic_volume', max=1000), IntSlider(value=30, descri…
unique_locations = df['location_id'].unique()

# Simulated coordinates: each location gets a random point inside a
# 0.1-degree box starting at (12.9, 77.5). The fixed seed keeps the layout
# identical between runs; latitude is drawn before longitude for each location.
np.random.seed(42)
location_coords = {
    loc: (12.9 + np.random.rand() / 10, 77.5 + np.random.rand() / 10)
    for loc in unique_locations
}

# Attach the simulated coordinates to every row of the dataset.
df['Latitude'] = df['location_id'].map(lambda loc_id: location_coords[loc_id][0])
df['Longitude'] = df['location_id'].map(lambda loc_id: location_coords[loc_id][1])
def simple_route(start_loc, end_loc, min_signal_status=2):
    """Display a straight-line route between two locations, nudged off hotspots.

    Parameters
    ----------
    start_loc, end_loc
        Keys of ``location_coords`` identifying the endpoints.
    min_signal_status : int, default 2
        Locations having at least one row with ``signal_status`` greater than
        or equal to this value are treated as hotspots.

    The route consists of 10 evenly spaced waypoints. An intermediate
    waypoint closer than 0.5 km to a hotspot is shifted by 0.001 degrees in
    latitude and longitude for each such hotspot. The first and last points
    are always the exact start and end coordinates.
    """
    start_lat, start_lon = location_coords[start_loc]
    end_lat, end_lon = location_coords[end_loc]

    m = folium.Map(location=[(start_lat + end_lat) / 2, (start_lon + end_lon) / 2], zoom_start=13)

    # Mark start and end
    folium.Marker([start_lat, start_lon], tooltip="Start", icon=folium.Icon(color='green')).add_to(m)
    folium.Marker([end_lat, end_lon], tooltip="End", icon=folium.Icon(color='red')).add_to(m)

    # Hotspots. Many rows share the same coordinates, so duplicates are
    # dropped: otherwise identical markers are stacked on the map and a single
    # hotspot pushes a waypoint away once per duplicate row.
    hotspot_coords = (
        df.loc[df['signal_status'] >= min_signal_status, ['Latitude', 'Longitude']]
        .drop_duplicates()
        .values
        .tolist()
    )
    for coord in hotspot_coords:
        folium.CircleMarker(location=coord, radius=5, color='orange', fill=True, fill_opacity=0.7).add_to(m)

    # Simple straight-line route avoiding hotspots
    steps = 10
    route_points = [[start_lat, start_lon]]
    for i in range(1, steps + 1):
        # Interpolate every waypoint from the start so that a nudge applied to
        # one waypoint does not carry over into all the following ones.
        fraction = i / steps
        next_point = [
            start_lat + (end_lat - start_lat) * fraction,
            start_lon + (end_lon - start_lon) * fraction,
        ]
        # The final waypoint is the destination itself and is never moved,
        # so the line always ends at the "End" marker.
        if i < steps:
            for h in hotspot_coords:
                if geodesic(next_point, h).km < 0.5:
                    next_point[0] += 0.001
                    next_point[1] += 0.001
        route_points.append(next_point)

    folium.PolyLine(route_points, color='blue', weight=4, opacity=0.7, tooltip='Route').add_to(m)
    display(m)


display(widgets.interactive(
    simple_route,
    start_loc=widgets.Dropdown(options=unique_locations, description='Start Location'),
    end_loc=widgets.Dropdown(options=unique_locations, description='End Location'),
    min_signal_status=widgets.IntSlider(min=0, max=2, step=1, value=2),
))
interactive(children=(Dropdown(description='Start Location', options=(4, 5, 3, 2, 1), value=4), Dropdown(descr…
G = nx.Graph()

# One node per location. Every row of a location carries the same
# coordinates, so iterating the unique ids is enough.
location_ids = [int(loc) for loc in unique_locations]
for loc in location_ids:
    G.add_node(loc, pos=location_coords[loc])

# Average signal status per location, computed once instead of inside the
# pair loop. It acts as the congestion penalty on edge weights.
mean_signal = df.groupby('location_id')['signal_status'].mean()

# Connect locations within threshold distance
threshold_km = 1  # 1 km radius

# The graph is undirected, so each unordered pair is visited once. All pairs
# are also recorded in `candidate_edges` for the connectivity fallback below.
candidate_edges = nx.Graph()
for position, loc_a in enumerate(location_ids):
    for loc_b in location_ids[position + 1:]:
        distance = geodesic(location_coords[loc_a], location_coords[loc_b]).km
        # Weight includes congestion penalty
        weight = distance * (1 + mean_signal[loc_a] + mean_signal[loc_b])
        candidate_edges.add_edge(loc_a, loc_b, weight=weight)
        if distance <= threshold_km:
            G.add_edge(loc_a, loc_b, weight=weight)

# The simulated coordinates are spread over roughly a 0.1-degree box, so the
# radius above can leave some (or all) locations without any neighbour, and
# shortest-path queries between them would fail. When that happens, add the
# cheapest set of bridging edges (a minimum spanning tree over all pairs) so
# that every location is reachable.
if G.number_of_nodes() > 1 and not nx.is_connected(G):
    backbone = nx.minimum_spanning_tree(candidate_edges, weight='weight')
    G.add_edges_from(backbone.edges(data=True))
def optimized_shortest_path(start_loc, end_loc):
    """Display the lowest-weight path between two locations of graph ``G``.

    Parameters
    ----------
    start_loc, end_loc
        Node ids in ``G`` (and keys of ``location_coords``).

    The path is found with Dijkstra's algorithm on the ``weight`` edge
    attribute and drawn on a folium map together with start/end markers and
    the locations that have at least one row with ``signal_status >= 2``.
    If the two locations are not connected, or one of them is not in the
    graph, a message is printed and no map is shown.
    """
    try:
        path = nx.dijkstra_path(G, start_loc, end_loc, weight='weight')
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # A disconnected graph is a data condition, not a programming error;
        # report it instead of raising inside the widget callback.
        print(f"No route available between location {start_loc} and location {end_loc}.")
        return

    route_coords = [location_coords[p] for p in path]
    centre_lat = np.mean([c[0] for c in route_coords])
    centre_lon = np.mean([c[1] for c in route_coords])

    m = folium.Map(location=[centre_lat, centre_lon], zoom_start=13)
    folium.PolyLine(route_coords, color='blue', weight=4, opacity=0.7, tooltip='Optimized Route').add_to(m)
    folium.Marker(location_coords[start_loc], tooltip='Start', icon=folium.Icon(color='green')).add_to(m)
    folium.Marker(location_coords[end_loc], tooltip='End', icon=folium.Icon(color='red')).add_to(m)

    # Many rows share the same coordinates; draw each hotspot only once
    # instead of stacking identical markers on top of each other.
    hotspot_coords = (
        df.loc[df['signal_status'] >= 2, ['Latitude', 'Longitude']]
        .drop_duplicates()
        .values
        .tolist()
    )
    for coord in hotspot_coords:
        folium.CircleMarker(location=coord, radius=5, color='orange', fill=True,
                            fill_opacity=0.7, tooltip='Hotspot').add_to(m)
    display(m)


display(widgets.interactive(
    optimized_shortest_path,
    start_loc=widgets.Dropdown(options=unique_locations, description='Start Location'),
    end_loc=widgets.Dropdown(options=unique_locations, description='End Location'),
))
interactive(children=(Dropdown(description='Start Location', options=(4, 5, 3, 2, 1), value=4), Dropdown(descr…
from folium.plugins import TimestampedGeoJson
def simulate_real_time_animated(df, steps=10):
    """Animate randomly evolving congestion on a folium map.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``traffic_volume``, ``Latitude`` and ``Longitude``
        columns. The frame is copied; the caller's data is not modified.
    steps : int, default 10
        Number of simulated time steps, spaced five minutes apart.

    At each step every traffic volume is changed by a random integer between
    -10 and 10 (never dropping below 0) and the signal status is re-derived
    from it: 2 above 500, 1 above 250, otherwise 0. Rows with status 1 or 2
    become timestamped points (orange and red respectively).
    """

    def perturb(volume):
        return max(0, volume + random.randint(-10, 10))

    def congestion_level(volume):
        if volume > 500:
            return 2
        return 1 if volume > 250 else 0

    # Accumulates one GeoJSON feature per congested row per step.
    geo_features = []
    df_sim = df.copy()

    for step in range(steps):
        # Randomly update traffic_volume and signal_status
        df_sim['traffic_volume'] = df_sim['traffic_volume'].apply(perturb)
        df_sim['signal_status'] = df_sim['traffic_volume'].apply(congestion_level)

        timestamp = pd.Timestamp.now() + pd.Timedelta(minutes=step * 5)
        time_label = timestamp.isoformat()

        # Only show moderate/high congestion
        congested = df_sim[df_sim['signal_status'] >= 1]
        rows = zip(
            congested['Longitude'].tolist(),
            congested['Latitude'].tolist(),
            congested['signal_status'].tolist(),
        )
        for longitude, latitude, status in rows:
            colour = 'red' if status == 2 else 'orange'
            geo_features.append({
                'type': 'Feature',
                'geometry': {
                    'type': 'Point',
                    'coordinates': [longitude, latitude],
                },
                'properties': {
                    'time': time_label,
                    'style': {'color': colour},
                    'icon': 'circle',
                    'iconstyle': {
                        'fillColor': colour,
                        'fillOpacity': 0.7,
                        'radius': 6,
                    },
                },
            })

    # Create a base map centred on the mean coordinates of the input data.
    map_centre = [df['Latitude'].mean(), df['Longitude'].mean()]
    m = folium.Map(location=map_centre, zoom_start=13)

    collection = {
        'type': 'FeatureCollection',
        'features': geo_features,
    }
    TimestampedGeoJson(
        collection,
        period='PT5M',
        add_last_point=True,
        auto_play=True,
        loop=False,
        max_speed=1,
    ).add_to(m)
    display(m)


# Run simulation
simulate_real_time_animated(df, steps=10)